#########################################################
## @file   : tcompare.py
## @desc   : compare regexp & xpath & beautifulsoup
## @create : 2021/07/18
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

import json

import os
import logging
import time
from collections import defaultdict

import re
from bs4 import BeautifulSoup
from lxml import etree
import html

# Logging: the root logger passes everything from DEBUG upwards to the file 'log'
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.FileHandler('log'))

# Site switch: a truthy value selects Douban, a falsy one Amazon.cn
ISDOUBAN = 1
if ISDOUBAN:
    LINKPREF = 'https://book.douban.com/subject/'
else:
    LINKPREF = 'https://www.amazon.cn/s?k='

class bookInfoSpide():
    """Extract book records from a search-result page in three different ways.

    The same HTML text can be parsed with line-oriented regular expressions
    (``grab_book_info_regex``), with BeautifulSoup (``grab_book_info_bs4``)
    or with lxml/XPath (``grab_book_info_xpath``), so the approaches can be
    compared against each other.

    The ``re_*`` class attributes are the line patterns of the regex parser.
    Which of them get compiled depends on the module-level ``ISDOUBAN``
    switch; the remaining ones stay ``None``.

    NOTE(review): grab_book_info_regex only follows the Douban patterns, so
    it looks unusable when ISDOUBAN is 0 -- confirm before using that mode.
    """

    # Every pattern name exists on the class, including the ones the active
    # mode does not compile, so reading one never raises AttributeError.
    re_bn = re_star = re_score = re_ratenum = re_author = re_description = None
    re_asin = re_img = re_rate = re_end = None

    if ISDOUBAN==1:
        # groups: (1) sid, (2) title, (3) cover image url -- all on one line
        re_bn=re.compile(r'''class=\"nbg.+?sid: (\d+?),.+?title=\"(.+?)\".+?img src=\"(.+?)\"''')
        re_star=re.compile(r'''^<span class=\"allstar(\d+)\"></span>''')
        re_score=re.compile(r'''class=\"rating_nums\">(.+?)<''')
        re_ratenum=re.compile(r'''^<span>\((\d+)人评价\)</span>''')
        re_author=re.compile(r'''class=\"subject-cast\">(.+?)<''')
        re_description=re.compile(r'''^<p>(.+?)(<\/p>){0,1}$''')
    else:
        re_asin=re.compile(r'''^<div data-asin=\"(.+?)\" data-index''')
        re_img=re.compile(r'''^<img src=\"(.+?)\"$''')
        re_bn=re.compile(r'''^alt=\"(.+?)\"$''')
        re_author=re.compile(r'''^<div class=.+auto\"><\/span>.+$''')
        re_rate=re.compile(r'''^<span aria-label=\"(.+?)\">$''')
        re_end=re.compile(r'''^<span class=\"a-letter-space\"><\/span><\/div><\/div>''')

    # sid inside the anchor's onclick="moreurl(this,{..., sid: 3845101, ...})"
    re_sid = re.compile(r'^.+sid: (\d+),.+$')

    # Patterns applied to the serialized <span> children of the rating block.
    _re_span_class = re.compile(r'class=\"(.+?)\"')
    _re_span_score = re.compile(r'.+rating_nums\">(.+)<\/')
    _re_span_count = re.compile(r'.+>\((.+)\)<')
    _re_span_cast = re.compile(r'.+subject-cast\">(.+)<\/')

    def __init__(self):
        """Nothing to set up: all parsers work on the text passed to them."""
        pass

    @staticmethod
    def _split_cast(fields):
        """Split the ' / '-separated cast fields into (author, publisher, publishing).

        With three or more fields the last two are publisher and publishing
        date and everything before them is joined with '/' as the author.
        With fewer fields only the first one is used (as author) and the
        other two values are None.
        """
        if len(fields) >= 3:
            *authors, publisher, publishing = fields
            return '/'.join(authors), publisher, publishing
        return fields[0], None, None

    @classmethod
    def _parse_rating_spans(cls, spans):
        """Decode the serialized <span> elements of one rating block.

        spans: list of strings, either
            [star, score, rating count, cast]   (4 items) or
            [star, rating count, cast]          (3 items, score becomes 0).

        return: (star, score, rating_number, cast_fields), or None when the
        number of spans is unexpected or one of them does not match.
        """
        if len(spans) == 4:
            star_html, score_html, count_html, cast_html = spans
        elif len(spans) == 3:
            star_html, count_html, cast_html = spans
            score_html = None
        else:
            return None

        star = cls._re_span_class.search(star_html)
        count = cls._re_span_count.search(count_html)
        cast = cls._re_span_cast.search(cast_html)
        if not (star and count and cast):
            return None

        if score_html is None:
            score = 0
        else:
            score_match = cls._re_span_score.search(score_html)
            if not score_match:
                return None
            score = score_match.group(1)

        return star.group(1), score, count.group(1), cast.group(1).split(' / ')

    # parse response by regular expression method
    def grab_book_info_regex(self, resp):
        """Parse the page line by line with a small state machine.

        The states follow the order of the lines of one record:
        SID -> STAR -> [SCORE -> RATENUM ->] AUTHOR -> DESCRIPTION -> SID.
        SCORE and RATENUM are skipped when the star value is zero.
        A state is only left when its pattern matches, so a record without a
        matching line for the current state delays everything after it.

        resp: the page as one string.

        return: defaultdict keyed by '<sid>-<running index>', e.g. {
            "25853071-0": {
                "link": "https://book.douban.com/subject/25853071",
                "bookname": "...",
                "img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2575362797.jpg",
                "score": "8.0",
                "ratenum": "1000",
                "author": "...",
                "publisher": "...",
                "publishing": "2015",
                "description": "..."
            },...}
        "score"/"ratenum" are absent for unrated books, "publisher"/
        "publishing" are absent when the cast has fewer than three fields.
        """
        bkinfo = defaultdict(dict)
        sid, stat, idx = None, 'SID', 0

        for line in resp.split('\n'):
            line = line.strip()
            if not line:
                continue

            if stat == 'SID':
                ret = self.re_bn.search(line)
                if ret:
                    raw_sid = ret.group(1)
                    # the index suffix keeps repeated sids apart as dict keys
                    sid = raw_sid + '-{}'.format(idx)
                    # plain concatenation: os.path.join is platform dependent
                    # and the link must carry the bare sid, not the dict key
                    bkinfo[sid]['link'] = LINKPREF + raw_sid
                    bkinfo[sid]['bookname'] = ret.group(2)
                    bkinfo[sid]['img'] = ret.group(3)
                    idx += 1
                    stat = 'STAR'
            elif stat == 'STAR':
                ret = self.re_star.search(line)
                if ret:
                    # zero stars: there is no score / rating count to read
                    stat = 'SCORE' if int(ret.group(1)) > 0 else 'AUTHOR'
            elif stat == 'SCORE':
                ret = self.re_score.search(line)
                if ret:
                    bkinfo[sid]['score'] = ret.group(1)
                    stat = 'RATENUM'
            elif stat == 'RATENUM':
                ret = self.re_ratenum.search(line)
                if ret:
                    bkinfo[sid]['ratenum'] = ret.group(1)
                    stat = 'AUTHOR'
            elif stat == 'AUTHOR':
                ret = self.re_author.search(line)
                if ret:
                    author, publisher, publishing = self._split_cast(
                        ret.group(1).split(' / '))
                    bkinfo[sid]['author'] = author
                    if publisher is not None:
                        bkinfo[sid]['publisher'] = publisher
                        bkinfo[sid]['publishing'] = publishing
                    stat = 'DESCRIPTION'
            elif stat == 'DESCRIPTION':
                ret = self.re_description.search(line)
                if ret:
                    bkinfo[sid]['description'] = ret.group(1)
                    stat = 'SID'

        return bkinfo

    # parse response by beautiful soup method
    def grab_book_info_bs4(self, resp):
        """Parse the page with BeautifulSoup ("html.parser").

        Only <div class="result"> entries whose first <span> contains '书籍'
        are used. Entries lacking the cover anchor, the rating block or a
        recognisable sid / rating layout are skipped.

        resp: the page as one string.

        return: defaultdict keyed by '<sid>-<running index>' whose values
        hold the keys bookname, link, img, star, score, ratingnum, author,
        publisher, publishing and description. publisher/publishing are None
        when the cast has fewer than three fields, description is None when
        the entry has no <p>. Note the key is 'ratingnum' here while
        grab_book_info_regex uses 'ratenum'.
        """
        soup = BeautifulSoup(resp, "html.parser")
        bkinfo = defaultdict(dict)
        idx = 0

        for t in soup.find_all("div", attrs={"class": "result"}):
            sp = t.span
            # get_text() rather than .string: .string is None as soon as the
            # span has more than one child, which would break the 'in' test
            if sp is None or '书籍' not in sp.get_text():
                continue

            # <a class="nbg" href="..." onclick="moreurl(this,{..., sid: 3845101, ...})"
            #    title="..."><img src="..."/></a>
            nbg = t.find('a', attrs={'class': 'nbg'})
            # <div class="rating-info">
            #   <span class="allstar40"></span>
            #   <span class="rating_nums">8.3</span>
            #   <span>(...)</span>
            #   <span class="subject-cast">author / publisher / year</span>
            # </div>
            rating = t.find('div', attrs={'class': 'rating-info'})
            if nbg is None or rating is None:
                continue

            onclick = self.re_sid.search(nbg.get('onclick', ''))
            if onclick is None:
                continue

            title = nbg['title']
            bookhref = nbg['href']
            imghref = nbg.img['src']

            # whitespace-only text nodes between the spans are dropped
            rate_list = [str(r).strip() for r in rating.children if str(r).strip()!='']
            parsed = self._parse_rating_spans(rate_list)
            if parsed is None:
                continue
            star, rating_score, rating_number, tt = parsed
            author, publisher, publishing = self._split_cast(tt)

            sid = '-'.join([onclick.group(1), str(idx)])

            # str.strip('</p>') would strip a character set, not the tag
            ptag = t.p
            description = ptag.get_text().strip() if ptag is not None else None

            bkinfo[sid].update({
                'bookname': title,
                'link': bookhref,
                'img': imghref,
                'star': star,
                'score': rating_score,
                'ratingnum': rating_number,
                'author': author,
                'publisher': publisher,
                'publishing': publishing,
                'description': description,
            })

            idx += 1

        return bkinfo

    # parse response by xpath method
    def grab_book_info_xpath(self, resp):
        """Parse the page with lxml and XPath expressions.

        Only <div class="result"> entries whose 'div/div/h3/span[1]' contains
        '书籍' are used. Entries lacking one of the anchor attributes or a
        recognisable sid / rating layout are skipped.

        resp: the page as one string.

        return: defaultdict keyed by '<sid>-<running index>' with the same
        keys as grab_book_info_bs4; description is the text of the first
        'div/p' element or None.
        """
        bkinfo = defaultdict(dict)
        dom = etree.HTML(resp)
        if dom is None:
            return bkinfo

        idx = 0
        for s in dom.xpath("//div[@class='result']"):
            sp = s.xpath('div/div/h3/span[1]')
            if not sp:
                continue

            # the serialized markup is passed through html.unescape so that
            # character references become plain text before the substring test
            typ = html.unescape(etree.tostring(sp[0]).decode('utf8'))
            if '书籍' not in typ:
                continue

            # <a class="nbg" href="..." onclick="moreurl(this,{..., sid: 3845101, ...})"
            #    title="..."><img src="..."/></a>
            hrefs = s.xpath('div/a/@href')
            titles = s.xpath('div/a/@title')
            onclicks = s.xpath('div/a/@onclick')
            imgs = s.xpath('div/a/img/@src')
            if not (hrefs and titles and onclicks and imgs):
                continue

            sid_match = self.re_sid.search(str(onclicks[0]))
            if sid_match is None:
                continue

            ri = s.xpath("div/div/div/span")
            rating_info = [html.unescape(etree.tostring(x).decode('utf8')).strip()
                           for x in ri]
            parsed = self._parse_rating_spans(rating_info)
            if parsed is None:
                continue
            star, rating_score, rating_number, tt = parsed
            author, publisher, publishing = self._split_cast(tt)

            sid = '-'.join([sid_match.group(1), str(idx)])
            description = s.xpath('div/p')

            bkinfo[sid].update({
                'bookname': titles[0],
                'link': hrefs[0],
                'img': imgs[0],
                'star': star,
                'score': rating_score,
                'ratingnum': rating_number,
                'author': author,
                'publisher': publisher,
                'publishing': publishing,
                'description': description[0].text if description else None,
            })

            idx += 1

        return bkinfo


if __name__=='__main__':
    # Benchmark driver: parse the sample page repeated 1..5 times with each
    # of the three parsers, log the timings and plot them.

    spide=bookInfoSpide()
    # filename:linenumber
    fnlist = { './tdouban.data.t':4032,
               './tdata':25235,
             }
    # the first entry is the sample that gets benchmarked
    fn, lnum = next(iter(fnlist.items()))

    with open(fn, 'r', encoding='utf8', errors='ignore') as f:
        resp=f.read()

    # Sample result of an earlier run:
    #   records | method        | elapsed time (seconds)
    #   4096    | regex         | 0.012344121932983398
    #   4096    | xpath         | 0.06005978584289551
    #   4096    | beautifulsoup | 0.23246979713439941
    logger.debug('记录数|方法|匹配长度|消耗时间(secends)')
    logger.debug('--|--|--|--')
    dic_count_x, dic_time_re, dic_time_xpath, dic_time_bs = [], [], [], []

    # (log name, parser, list collecting its timings, console message)
    contenders = (
        ('regex', spide.grab_book_info_regex, dic_time_re,
         'regex {} lines cost time - {} seconds'),
        ('xpath', spide.grab_book_info_xpath, dic_time_xpath,
         'xpath {} lines cost time - {} seconds'),
        ('bs4', spide.grab_book_info_bs4, dic_time_bs,
         'bs4 {} lines cost time - {} seconds \n'),
    )

    for n in range(5):
        _resp = '\n'.join([resp]*(n+1))
        linecount = lnum*(n+1)
        dic_count_x.append(linecount)

        for name, parse, timings, message in contenders:
            # perf_counter is monotonic and meant for measuring intervals
            start = time.perf_counter()
            bkinfo = parse(_resp)
            cost = time.perf_counter() - start

            print(message.format(linecount, cost))
            logger.debug('{}|{}|{}|{}'.format(linecount, name, len(bkinfo), cost))
            timings.append(cost)

            # drop the result before the next parser is timed
            bkinfo = None

    # plot
    import matplotlib.pyplot as plt

    plt.title('REGEX VS. XPATH VS. BEAUTIFULSOUP COMPARE (Chengan)')
    plt.xlabel('Line Numbers')
    plt.ylabel('Time Cost(seconds)')
    plt.plot(dic_count_x, dic_time_re, linestyle='-', marker='o',color='m',label='REGEX')
    plt.plot(dic_count_x, dic_time_xpath, linestyle='--', marker='*',color='g',label='XPATH')
    plt.plot(dic_count_x, dic_time_bs, linestyle=':', marker='.',color='c',label='BS4')
    # without a legend the label= arguments above are never displayed
    plt.legend()

    plt.show()


